{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !pip3 install git+https://github.com/huseinzol05/malaya.git@4.6.1 --no-deps"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'mesolitica-tpu.json'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import malaya\n",
    "from malaya.preprocessing import Tokenizer\n",
    "from malaya.text.function import case_of\n",
    "from malaya.augmentation import (\n",
    "    replace_similar_consonants, \n",
    "    replace_similar_vowels, \n",
    "    socialmedia_form,\n",
    "    vowel_alternate)\n",
    "from malaya.text import rules\n",
    "from collections import defaultdict\n",
    "import random\n",
    "import re\n",
    "import tensorflow as tf\n",
    "from malaya.text.tatabahasa import alphabet, consonants, vowels\n",
    "from malaya.text.function import augmentation_textcleaning, simple_textcleaning\n",
    "\n",
    "def cleaning_row(string):\n",
    "    \"\"\"\n",
    "    Flatten a row of text onto a single line.\n",
    "\n",
    "    Newlines and tabs are turned into spaces, runs of spaces are collapsed\n",
    "    into one, and surrounding whitespace is stripped.\n",
    "    \"\"\"\n",
    "    flattened = string\n",
    "    for separator in ('\\n', '\\t'):\n",
    "        flattened = flattened.replace(separator, ' ')\n",
    "    collapsed = re.sub(r'[ ]+', ' ', flattened)\n",
    "    return collapsed.strip()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Invert rules.rules_normalizer: each single-word value maps to the list of\n",
    "# keys that map to it. Values containing a space are left out.\n",
    "replace_normalizer = defaultdict(list)\n",
    "for source_form, target_form in rules.rules_normalizer.items():\n",
    "    if ' ' in target_form:\n",
    "        continue\n",
    "    replace_normalizer[target_form].append(source_form)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['juge', 'jugak']"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def socialmedia_form(word: str):\n",
    "    \"\"\"\n",
    "    Augment a word into social-media style spellings.\n",
    "\n",
    "    Note: this definition shadows the `socialmedia_form` imported from\n",
    "    `malaya.augmentation` in the imports cell.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    word: str\n",
    "        Word to augment; it is passed through `simple_textcleaning` first.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    result: List[str]\n",
    "        Unique candidate spellings in the order the rules produced them;\n",
    "        empty when no rule matches.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    ValueError\n",
    "        If nothing is left of `word` after cleaning.\n",
    "    \"\"\"\n",
    "\n",
    "    word = simple_textcleaning(word)\n",
    "    if not len(word):\n",
    "        raise ValueError('word is too short to augment shortform.')\n",
    "\n",
    "    results = []\n",
    "\n",
    "    if len(word) > 1:\n",
    "\n",
    "        # consonant + trailing 'a' -> trailing 'e', e.g. 'juga' -> 'juge'\n",
    "        if word[-1] == 'a' and word[-2] in consonants:\n",
    "            results.append(word[:-1] + 'e')\n",
    "\n",
    "        # leading 'f' on a word ending in 'r' -> leading 'p'\n",
    "        if word[0] == 'f' and word[-1] == 'r':\n",
    "            results.append('p' + word[1:])\n",
    "\n",
    "        # consonant + trailing vowel -> append 'k', e.g. 'juga' -> 'jugak'\n",
    "        if word[-2] in consonants and word[-1] in vowels:\n",
    "            results.append(word + 'k')\n",
    "\n",
    "        # vowel + trailing 'h' -> drop the 'h'\n",
    "        if word[-2] in vowels and word[-1] == 'h':\n",
    "            results.append(word[:-1])\n",
    "\n",
    "    if len(word) > 2:\n",
    "        # consonant + trailing 'ar' -> trailing 'o'\n",
    "        if word[-3] in consonants and word[-2:] == 'ar':\n",
    "            results.append(word[:-2] + 'o')\n",
    "\n",
    "        # leading 'h' + vowel + consonant -> drop the 'h'\n",
    "        if word[0] == 'h' and word[1] in vowels and word[2] in consonants:\n",
    "            results.append(word[1:])\n",
    "\n",
    "        # consonant + trailing 'ng' -> trailing 'g'\n",
    "        if word[-3] in consonants and word[-2:] == 'ng':\n",
    "            results.append(word[:-2] + 'g')\n",
    "\n",
    "        # 'ng' at positions 1-2 -> drop the 'n'. This used to read `x[2:]`;\n",
    "        # `x` is not defined anywhere, so every such word raised NameError.\n",
    "        if word[1:3] == 'ng':\n",
    "            results.append(word[:1] + word[2:])\n",
    "\n",
    "    # De-duplicate while keeping rule order, so the result does not depend on\n",
    "    # set iteration order.\n",
    "    return list(dict.fromkeys(results))\n",
    "\n",
    "socialmedia_form('juga')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'makan ayam'"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def random_slide(string, min_n = 2):\n",
    "    \"\"\"\n",
    "    Return a random contiguous window of whitespace-separated words.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    string: str\n",
    "    min_n: int, optional (default=2)\n",
    "        Minimum number of words in the window.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    result: str\n",
    "        Between `min_n` and all of the words of `string`, joined by single\n",
    "        spaces. When `string` has `min_n` words or fewer, all of its words\n",
    "        are returned.\n",
    "    \"\"\"\n",
    "    splitted = string.split()\n",
    "    # With fewer than min_n words random.randint(min_n, len(splitted)) gets an\n",
    "    # empty range and raises ValueError, so return what is there instead.\n",
    "    if len(splitted) <= min_n:\n",
    "        return ' '.join(splitted)\n",
    "    n = random.randint(min_n, len(splitted))\n",
    "    i = random.randint(0, len(splitted) - n)\n",
    "    return ' '.join(splitted[i: i + n])\n",
    "\n",
    "random_slide('Husein makan ayam di kampung Jawa juga')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['word']"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'word'.split('-')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'abaf kel-14-14-14-14'"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer = Tokenizer(duration = False, date = False).tokenize\n",
    "\n",
    "def augment(string):\n",
    "    \"\"\"\n",
    "    Inject synthetic misspellings into `string`, token by token.\n",
    "\n",
    "    Title-case / upper-case tokens are only ever swapped (about 70% of the\n",
    "    time, when a variant exists) for an entry from `replace_normalizer`.\n",
    "    For every other token the part before the first hyphen is rewritten\n",
    "    with `socialmedia_form` or `replace_normalizer`, then passed through\n",
    "    `vowel_alternate`, `replace_similar_consonants` and\n",
    "    `replace_similar_vowels`; the part after the hyphen is re-attached\n",
    "    unchanged (lower-cased).\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    string: str\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    result: str\n",
    "        Tokens joined by single spaces. A token whose augmentation raises\n",
    "        is emitted unchanged.\n",
    "    \"\"\"\n",
    "    r = []\n",
    "    for word in tokenizer(string):\n",
    "        original_word = word\n",
    "        word_lower = word.lower()\n",
    "        try:\n",
    "            if word.istitle() or word.isupper():\n",
    "                # .get() rather than indexing: indexing the defaultdict\n",
    "                # inserts an empty list on every miss, which then makes later\n",
    "                # membership tests succeed for words that have no variants.\n",
    "                candidates = replace_normalizer.get(word_lower)\n",
    "                if candidates and random.random() >= 0.3:\n",
    "                    word = case_of(word)(random.choice(candidates))\n",
    "            else:\n",
    "                splitted = word_lower.split('-')\n",
    "                head = splitted[0]\n",
    "                after = '-'.join(splitted[1:])\n",
    "                if len(splitted) > 1:\n",
    "                    word = head\n",
    "                # Augment only the part before the hyphen; passing the whole\n",
    "                # hyphenated token could rewrite the suffix that is appended\n",
    "                # again below.\n",
    "                s = socialmedia_form(head)\n",
    "                if len(s):\n",
    "                    word = case_of(word)(random.choice(s))\n",
    "                else:\n",
    "                    candidates = replace_normalizer.get(head)\n",
    "                    if candidates and random.random() >= 0.3:\n",
    "                        word = case_of(word)(random.choice(candidates))\n",
    "\n",
    "                word = case_of(word)(vowel_alternate(word, 0.7))\n",
    "                word = case_of(word)(replace_similar_consonants(word, 0.95))\n",
    "                word = case_of(word)(replace_similar_vowels(word, 0.8))\n",
    "\n",
    "                if len(after):\n",
    "                    word = f'{word}-{after}'\n",
    "\n",
    "        except Exception:\n",
    "            # Best effort: e.g. socialmedia_form raises ValueError when nothing\n",
    "            # is left of a token after cleaning; keep such tokens as they are.\n",
    "            word = original_word\n",
    "\n",
    "        r.append(word)\n",
    "    return ' '.join(r)\n",
    "\n",
    "augment('abad ke-14-14-14-14')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Husein makn ayam dik kf Jawa jugak .'"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "string = \"\"\"\n",
    "Husein makan ayam di kampung Jawa juga\n",
    "\"\"\"\n",
    "splitted = malaya.text.function.split_into_sentences(string)\n",
    "augment(splitted[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Husein makan aym dik kf Jawa .'"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "string = \"\"\"\n",
    "Husein makan ayam di kampung Jawa\n",
    "\"\"\"\n",
    "splitted = malaya.text.function.split_into_sentences(string)\n",
    "augment(splitted[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Husein makan ayam di kampung Jawa.'"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "splitted[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "files = ['/home/husein/pure-text/dumping-parliament.txt',]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "10000"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Read the corpus one row per line, dropping empty lines.\n",
    "with open(files[0]) as fopen:\n",
    "    lines = list(filter(None, fopen.read().split('\\n')))\n",
    "\n",
    "# Keep rows of at least 2 characters, then draw at most 10000 of them:\n",
    "# random.sample raises ValueError when asked for more rows than exist.\n",
    "lines = [i for i in lines if len(i) >= 2]\n",
    "data = random.sample(lines, min(10000, len(lines)))\n",
    "len(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "fast_text = malaya.language_detection.fasttext()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['other']"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fast_text.predict(['តើប្រព័ន្ធប្រតិបត្តិការណាដែលត្រូវគ្នាជាមួយកម្មវិធីធនាគារអេប៊ីអេ។'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tqdm import tqdm\n",
    "\n",
    "def loop(strings):\n",
    "    \"\"\"\n",
    "    Build (augmented, tokenized) sentence pairs from raw rows.\n",
    "\n",
    "    For each row: skip it when `fast_text` labels it 'other'; about 40% of\n",
    "    the time replace it by a `random_slide` window; tokenize it; then about\n",
    "    80% of the time pair the tokenized text with its `augment`-ed form,\n",
    "    otherwise pair it with itself.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    strings: List[str]\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    result: List[Tuple[str, str]]\n",
    "        Rows that raise while being processed are left out.\n",
    "    \"\"\"\n",
    "    results = []\n",
    "    for i in tqdm(range(len(strings))):\n",
    "        try:\n",
    "            string = strings[i]\n",
    "            if fast_text.predict([string])[0] == 'other':\n",
    "                continue\n",
    "            if random.random() > 0.6:\n",
    "                s = random_slide(string)\n",
    "                if not len(s):\n",
    "                    s = string\n",
    "            else:\n",
    "                s = string\n",
    "            t = ' '.join(tokenizer(s))\n",
    "            if random.random() >= 0.2:\n",
    "                row = augment(s)\n",
    "                results.append((row, t))\n",
    "            else:\n",
    "                results.append((t, t))\n",
    "        except Exception:\n",
    "            # Best effort: skip rows that fail, but no longer swallow\n",
    "            # KeyboardInterrupt / SystemExit the way a bare `except:` does.\n",
    "            continue\n",
    "    return results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['serbak', 'serbe']"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "socialmedia_form('serba')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 10/10 [00:00<00:00, 393.36it/s]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[('KO CHUNG SEN [ KAMPAR ] m ontak MENTERI KEWANGAN ( a ) imbuhan anual dn bunus yang d ibayo kpafak Leader Pegawai Eksekutif 1 MDB sjk ia ditibihkan padak thn 2009 ; dn ( b ) imbihn anual Pengerusi 1 MDB sjk taon 2009 , dan imbuhan anual stp Ahli Lembaga Pengarah 1 MDB sejk 2009 .',\n",
       "  'KO CHUNG SEN [ KAMPAR ] m inta MENTERI KEWANGAN ( a ) imbuhan tahunan dan bonus yang d ibayar kepada Ketua Pegawai Eksekutif 1 MDB sejak ia ditubuhkan pada tahun 2009 ; dan ( b ) imbuhan tahunan Pengerusi 1 MDB sejak tahun 2009 , dan imbuhan tahunan setiap Ahli Lembaga Pengarah 1 MDB sejak 2009 .'),\n",
       " ('Justeru , pade mase akn mao , pasarn mudal , khususnye psaran sukuk Msia dojngkak akn tris menjadik smbt pnting dalam penjanaan damak bgk negatak .',\n",
       "  'Justeru , pada masa akan datang , pasaran modal , khususnya pasaran sukuk Malaysia dijangka akan terus menjadi sumber penting dalam penjanaan dana bagi negara .'),\n",
       " ('SO ALAN PEMBERITAHUAN PERTANYAAN DEWAN RAKYAT , PARLIMEN MALAYSIA .',\n",
       "  'SO ALAN PEMBERITAHUAN PERTANYAAN DEWAN RAKYAT , PARLIMEN MALAYSIA .'),\n",
       " ('Selain itik , Gemen juge brstjuk menaikkn kado penggunaan .',\n",
       "  'Selain itu , Kerajaan juga bersetuju menaikkan kadar penggunaan .'),\n",
       " ('78 perats respon datk faslitik lshatan yanckh terlibat dng pelncongan kesohatan dam oa djangkak akn trisss meningkay sehingge penghijung thun',\n",
       "  '78 peratus respon dari fasiliti kesihatan yang terlibat dengan pelancongan kesihatan dan ia dijangka akan terus meningkat sehingga penghujung tahun'),\n",
       " ('dilaksnalan adala spertik berlut : i .',\n",
       "  'dilaksanakan adalah seperti berikut : i .'),\n",
       " ('pgpejai * f a o r is m P B T ^ # K a tk a o t i ^ .',\n",
       "  'PGpejai * f a i r iS m P B T ^ # K a te a o r i ^ .'),\n",
       " ('April 2015 slarik fengn halak tujuk Nasional dlm Rancangan Msia Ke-Sebelas ( RMK-11 ) Area \" ( KRA ) dik bwa Teras Strategik utk Pengukuhan b ) Inisiatit Program',\n",
       "  'April 2015 selari dengan hala tuju Nasional dalam Rancangan Malaysia Ke-Sebelas ( RMK-11 ) Area \" ( KRA ) di bawah Teras Strategik untuk Pengukuhan b ) Inisiatit Program'),\n",
       " ('srte kaktangan KMJ tla dsarng penyakot Tibi',\n",
       "  'serta kakitangan KMJ telah disaring penyakit Tibi'),\n",
       " (\"NIBONG TEBAL ] SOALAN : YB Dato ' Mansor bin Othman [ Nibong Tebal ] mintak MENTERI KESIHATAN mnytalan bilangn osptal yg tetlibat fengn indstrik pelancongan perubatm srte\",\n",
       "  \"NIBONG TEBAL ] SOALAN : YB Dato ' Mansor bin Othman [ Nibong Tebal ] minta MENTERI KESIHATAN menyatakan bilangan hospital yang terlibat dengan industri pelancongan perubatan serta\")]"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "loop(data[:10])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 625/625 [00:05<00:00, 113.43it/s]\n",
      "100%|██████████| 625/625 [00:06<00:00, 93.38it/s] \n",
      "100%|██████████| 625/625 [00:08<00:00, 77.22it/s]]\n",
      "100%|██████████| 625/625 [00:08<00:00, 74.39it/s] \n",
      "100%|██████████| 625/625 [00:08<00:00, 70.24it/s]\n",
      "100%|██████████| 625/625 [00:08<00:00, 70.55it/s]\n",
      "100%|██████████| 625/625 [00:09<00:00, 66.76it/s]\n",
      "100%|██████████| 625/625 [00:09<00:00, 64.75it/s]\n",
      "100%|██████████| 625/625 [00:09<00:00, 64.74it/s]\n",
      "100%|██████████| 625/625 [00:09<00:00, 64.56it/s]\n",
      "100%|██████████| 625/625 [00:09<00:00, 66.46it/s]]\n",
      "100%|██████████| 625/625 [00:09<00:00, 65.29it/s]\n",
      "100%|██████████| 625/625 [00:09<00:00, 62.59it/s]]\n",
      "100%|██████████| 625/625 [00:09<00:00, 64.43it/s]]\n",
      "100%|██████████| 625/625 [00:10<00:00, 61.89it/s] \n",
      "100%|██████████| 625/625 [00:10<00:00, 61.43it/s] \n"
     ]
    }
   ],
   "source": [
    "import cleaning\n",
    "\n",
    "results1 = cleaning.multiprocessing(data, loop)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 9742/9742 [00:00<00:00, 381197.20it/s]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0.707452268528023"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Fraction of pairs whose first element differs from the second.\n",
    "not_same = sum(1 for pair in tqdm(results1) if pair[0] != pair[1])\n",
    "\n",
    "not_same / len(results1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "with open('testset-spelling-augmentation.json', 'w') as fopen:\n",
    "    json.dump(results1, fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "b2_application_key_id = os.environ['b2_application_key_id']\n",
    "b2_application_key = os.environ['b2_application_key']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import only the names used below instead of `from b2sdk.v1 import *`,\n",
    "# so the notebook namespace is not filled with unrelated names.\n",
    "from b2sdk.v1 import B2Api, InMemoryAccountInfo\n",
    "\n",
    "# Authorise with the keys read from the environment and open the bucket\n",
    "# that the upload cells write to.\n",
    "info = InMemoryAccountInfo()\n",
    "b2_api = B2Api(info)\n",
    "application_key_id = b2_application_key_id\n",
    "application_key = b2_application_key\n",
    "b2_api.authorize_account(\"production\", application_key_id, application_key)\n",
    "file_info = {'how': 'good-file'}\n",
    "b2_bucket = b2_api.get_bucket_by_name('malay-dataset')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<b2sdk.file_version.FileVersionInfo at 0x7f85857decc0>"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Upload the JSON test set written by the json.dump cell to 'spelling/' in the bucket.\n",
    "b2_bucket.upload_local_file(\n",
    "    local_file='testset-spelling-augmentation.json',\n",
    "    file_name='spelling/testset-spelling-augmentation.json',\n",
    "    file_infos=file_info,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<b2sdk.file_version.FileVersionInfo at 0x7f8584b2c860>"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# NOTE(review): 'spelling-correction-news.tsv' is not written by any cell in\n",
    "# this notebook; it has to exist in the working directory already.\n",
    "b2_bucket.upload_local_file(\n",
    "    local_file='spelling-correction-news.tsv',\n",
    "    file_name='spelling/spelling-correction-news.tsv',\n",
    "    file_infos=file_info,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<b2sdk.file_version.FileVersionInfo at 0x7f8584cba588>"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# NOTE(review): 'spelling-correction-wiki.tsv' is not written by any cell in\n",
    "# this notebook; it has to exist in the working directory already.\n",
    "b2_bucket.upload_local_file(\n",
    "    local_file='spelling-correction-wiki.tsv',\n",
    "    file_name='spelling/spelling-correction-wiki.tsv',\n",
    "    file_infos=file_info,\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
